package com.trytech.mongoocrawler.server.parser.lianjia;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.trytech.mongoocrawler.server.common.queue.UrlFetcherEventProducer;
import com.trytech.mongoocrawler.server.entity.LianjiaItem;
import com.trytech.mongoocrawler.server.parser.HtmlParser;
import com.trytech.mongoocrawler.server.transport.http.UrlResult;
import com.trytech.mongoocrawler.server.transport.http.WebResult;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import static java.lang.Thread.sleep;

/**
 * Parses Lianjia second-hand housing listings out of a result-list web page.
 *
 * <p>Besides extracting the listings, the first call that finds the pagination
 * data also queues the URL of every result page on the supplied producer.
 */
public class LianjiaHtmlParser extends HtmlParser<List<LianjiaItem>> {

    /** Prefix of the result-page URLs; the page number and a trailing slash are appended. */
    private static final String PAGE_URL_PREFIX = "https://cd.lianjia.com/ershoufang/pg";

    /** Pause between two queued page URLs, in milliseconds. */
    private static final long ENQUEUE_DELAY_MILLIS = 1000L;

    /**
     * {@code true} until the result-page URLs have been queued; shared by all instances.
     *
     * NOTE(review): the flag is read and written without synchronization. If parse() can be
     * invoked from several threads at once, two of them may both queue the pages - confirm
     * against the caller.
     */
    private static boolean paginationPending = true;

    /**
     * Extracts every listing found under the {@code sellListContent} element of the page.
     * Entries that lack an expected element or carry a non-numeric price are skipped.
     *
     * @param webResult   fetched page; its data is read as the HTML string
     * @param urlProducer producer that receives the result-page URLs the first time
     *                    pagination data is found
     * @return the parsed listings (possibly empty), or {@code null} when the page has no
     *         {@code sellListContent} element or an unexpected error occurs
     */
    public List<LianjiaItem> parse(WebResult webResult, UrlFetcherEventProducer urlProducer){
        try {
            @SuppressWarnings("unchecked")
            String html = ((WebResult<String>) webResult).getData();
            Document doc = Jsoup.parse(html);
            doc.charset(Charset.forName("UTF-8"));
            Element body = doc.body();

            Element listEle = body.getElementsByClass("sellListContent").first();
            if (listEle == null) {
                // Not a listing page (or the layout changed): keep the "null on failure" contract.
                return null;
            }

            List<LianjiaItem> itemList = new LinkedList<LianjiaItem>();
            for (Element ele : listEle.children()) {
                LianjiaItem item = parseItem(ele);
                if (item != null) {
                    itemList.add(item);
                }
            }

            if (paginationPending) {
                enqueueResultPages(body, urlProducer);
            }
            return itemList;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Builds one listing from a child element of the listing container.
     *
     * @param ele a direct child of the {@code sellListContent} element
     * @return the populated item, or {@code null} if the element does not look like a listing
     */
    private static LianjiaItem parseItem(Element ele) {
        Element titleEle = firstChildOfClass(ele, "title");
        Element addressDiv = ele.getElementsByClass("houseInfo").first();
        Element addressEle = addressDiv == null ? null : addressDiv.getElementsByTag("a").first();
        Element priceEle = firstChildOfClass(ele, "totalPrice");
        Element unitPriceEle = firstChildOfClass(ele, "unitPrice");
        if (titleEle == null || addressEle == null || priceEle == null || unitPriceEle == null) {
            return null;
        }

        // The houseInfo text is '|'-separated; fields 1 and 2 are stored as type and floor space.
        String[] subAddress = addressDiv.text().split("\\|");
        if (subAddress.length < 3) {
            return null;
        }

        float price;
        int unitPrice;
        try {
            price = Float.parseFloat(priceEle.ownText());
            // Keep only the digits of the unit-price label so that any surrounding
            // text, separators or units cannot break the integer parse.
            unitPrice = Integer.parseInt(unitPriceEle.ownText().replaceAll("[^0-9]", ""));
        } catch (NumberFormatException e) {
            e.printStackTrace();
            return null;
        }

        LianjiaItem item = new LianjiaItem();
        item.setTitle(titleEle.ownText());
        item.setLocation(addressEle.text());
        item.setType(subAddress[1]);
        item.setFloorSpace(subAddress[2]);
        item.setPrice(price);
        item.setUnitPrice(unitPrice);
        return item;
    }

    /**
     * Returns the first child of the first descendant carrying the given class.
     *
     * @param container element to search in
     * @param className a single CSS class name
     * @return the child element, or {@code null} if either lookup finds nothing
     */
    private static Element firstChildOfClass(Element container, String className) {
        Element holder = container.getElementsByClass(className).first();
        return holder == null ? null : holder.children().first();
    }

    /**
     * Reads the total page count from the pagination element and queues one URL per page.
     * The pending flag is cleared only once the count is known, so a page without usable
     * pagination data leaves the work for a later call. Failures are reported and do not
     * propagate, so the listings already parsed are still returned to the caller.
     *
     * NOTE(review): numbering starts at 1, so page 1 is queued as well - confirm that the
     * seed URL is not the same page.
     *
     * @param body        body of the parsed page
     * @param urlProducer producer that receives the page URLs
     */
    private static void enqueueResultPages(Element body, UrlFetcherEventProducer urlProducer) {
        boolean interrupted = false;
        try {
            // getElementsByClass expects ONE class name; select on the distinguishing one.
            Element pageDiv = body.getElementsByClass("house-lst-page-box").first();
            if (pageDiv == null) {
                return;
            }
            JSONObject pageJSON = JSON.parseObject(pageDiv.attr("page-data"));
            Object totalPage = pageJSON == null ? null : pageJSON.get("totalPage");
            if (!(totalPage instanceof Number)) {
                return;
            }
            int totalCount = ((Number) totalPage).intValue();

            paginationPending = false;
            for (int i = 1; i <= totalCount; i++) {
                urlProducer.sendData(new UrlResult(PAGE_URL_PREFIX + i + "/", new LianjiaHtmlParser()));
                if (interrupted || i == totalCount) {
                    // No throttling once interrupted, and no pointless wait after the last page.
                    continue;
                }
                try {
                    Thread.sleep(ENQUEUE_DELAY_MILLIS);
                } catch (InterruptedException e) {
                    // Still queue the remaining pages, but stop sleeping and remember the request.
                    interrupted = true;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (interrupted) {
                // Restore the interrupt status for whoever owns this thread.
                Thread.currentThread().interrupt();
            }
        }
    }
}
